Run Libraries

rm(list=ls())

library(pheatmap)
library(ggplot2)
library(dichromat)
library(dplyr)
library(ggrepel)
library(reshape2)
library(umap)
library(ggthemes)
library(cowplot)
library(DEP)
library(readr)
library(naniar)
library(SummarizedExperiment)
library(data.table)
library(readxl)
library(writexl)
library(stringr)
library(vsn)
library(rmarkdown)
library(tidyr)
library(Polychrome)
library(RColorBrewer)
# library(stringr)
# library(MetaboAnalystR)
# library(matrixStats)
# library(wesanderson)
# library(clusterProfiler)
# library(enrichplot)
# library(msigdbr)

Set working directories

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# create directory for results
dir.create(file.path(getwd(),'results'), showWarnings = FALSE)
# create directory for plots
dir.create(file.path(getwd(),'plots'), showWarnings = FALSE)

Load data

Loading data including:

  • All templates: information on the tube ID, rack, visit, country, type of fluid, position
  • Shipment of all boxes file: information on the patient ID, visit, position
  • Raw data: lipidomics expression with info on the tube ID for CSF and Plasma

Step 1: merge info of all patients

The first step is to merge the information of the “All templates” and “Shipment of all boxes” files, and create 3 datasets (one for plasma, one for CSF, one for urine) with the information of Patient ID, Position, Biofluid, Visit, Tube number, Country, Rack and Box.

- CSF

## sample info - CSF
paged_table(lipidomics_CSF_all)
dim(lipidomics_CSF_all)
## [1] 406   8
lipidomics_CSF_all_summary <- lipidomics_CSF_all %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE),
    total_tubes = n_distinct(Tube_number, na.rm = TRUE),
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_CSF_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_CSF_all_V0 = lipidomics_CSF_all %>%
  filter(Visit == "V0")
paged_table(lipidomics_CSF_all_V0 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_CSF_all_V0 %>%
  filter(Patient %in% lipidomics_CSF_all_V0[duplicated(lipidomics_CSF_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_CSF_all_V1 = lipidomics_CSF_all %>%
  filter(Visit == "V1")
paged_table(lipidomics_CSF_all_V1 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_CSF_all_V1 %>%
  filter(Patient %in% lipidomics_CSF_all_V1[duplicated(lipidomics_CSF_all_V1$Patient),]$Patient))

- Plasma

## sample info - plasma
paged_table(lipidomics_plasma_all)
dim(lipidomics_plasma_all)
## [1] 431   8
lipidomics_plasma_all_summary <- lipidomics_plasma_all %>%
  group_by(Visit) %>%
   summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE),
    total_tubes = n_distinct(Tube_number, na.rm = TRUE),
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_plasma_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_plasma_all_V0 = lipidomics_plasma_all %>%
  filter(Visit == "V0")
paged_table(lipidomics_plasma_all_V0 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_plasma_all_V0 %>%
  filter(Patient %in% lipidomics_plasma_all_V0[duplicated(lipidomics_plasma_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_plasma_all_V1 = lipidomics_plasma_all %>%
  filter(Visit == "V1")
paged_table(lipidomics_plasma_all_V1 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_plasma_all_V1 %>%
  filter(Patient %in% lipidomics_plasma_all_V1[duplicated(lipidomics_plasma_all_V1$Patient),]$Patient))

- Urine

## sample info - urine
paged_table(lipidomics_urine_all)
dim(lipidomics_urine_all)
## [1] 430   8
lipidomics_urine_all_summary <- lipidomics_urine_all %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE),
    total_tubes = n_distinct(Tube_number, na.rm = TRUE),
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_urine_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_urine_all_V0 = lipidomics_urine_all %>%
  filter(Visit == "V0")
paged_table(lipidomics_urine_all_V0 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_urine_all_V0 %>%
  filter(Patient %in% lipidomics_urine_all_V0[duplicated(lipidomics_urine_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_urine_all_V1 = lipidomics_urine_all %>%
  filter(Visit == "V1")
paged_table(lipidomics_urine_all_V1 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_urine_all_V1 %>%
  filter(Patient %in% lipidomics_urine_all_V1[duplicated(lipidomics_urine_all_V1$Patient),]$Patient))

Step 2: aggregate all raw files

The second step is to aggregate all the raw data files together and save the batch information, in order to check later if there are any batch effects in the data.

- CSF

paged_table(raw_data_CSF_final)
dim(raw_data_CSF_final)
## [1] 174 894
# how many rows of QC
length(grep("QC",raw_data_CSF_final$Tube_number))
## [1] 30

- Plasma

paged_table(raw_data_plasma_final)
dim(raw_data_plasma_final)
## [1] 382 894
# how many rows of QC
length(grep("QC",raw_data_plasma_final$Tube_number))
## [1] 66

Overview of data

Step 3: Combine raw data files with patient info

The third step is to combine the raw data files with the patient information, and check the amount of samples per visit for each fluid.

- CSF

raw_data_CSF_IDs = raw_data_CSF_final %>%
  select(Tube_number,Batch)
raw_data_CSF_IDs <- raw_data_CSF_IDs %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
  filter(!str_detect(Tube_number, "QC")) # remove 30 QC IDs

# how many IDs in the raw data CSF
length(raw_data_CSF_IDs$Tube_number)
## [1] 144
# how many unique IDs in the raw data CSF
length(unique(raw_data_CSF_IDs$Tube_number))
## [1] 144
lipidomics_CSF_all <- lipidomics_CSF_all %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_CSF = raw_data_CSF_IDs %>%
  left_join(lipidomics_CSF_all,by = "Tube_number") %>%
  mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_CSF = full_match_CSF %>%
  filter(is.na(match_type)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.y) %>%
  select(-names(lipidomics_CSF_all)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_CSF = no_match_CSF %>%
  left_join(lipidomics_CSF_all %>% 
              filter(!Tube_number %in% (full_match_CSF %>% filter(!is.na(match_type)) %>%
                                          pull(Tube_number))),
            by = "Tube_number4") %>%
  mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))

# final description of the samples available in the raw data without QC samples
combined_data_CSF = full_match_CSF %>%
  filter(!is.na(match_type)) %>%
  select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
  bind_rows(last4_match_CSF %>% 
              select(-Tube_number) %>% 
              left_join(raw_data_CSF_IDs %>% select(Tube_number,Tube_number4) %>%
                          filter(!Tube_number %in% (full_match_CSF %>% filter(!is.na(match_type)) %>%
                                          pull(Tube_number)))) %>%
              select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
  distinct()

paged_table(combined_data_CSF)
# how many IDs in the raw data CSF now 
length(combined_data_CSF$Tube_number)
## [1] 144
# how many unique IDs in the raw data CSF now
length(unique(combined_data_CSF$Tube_number))
## [1] 144
# how many unique Patient IDs
length(unique(na.omit(combined_data_CSF$Patient)))
## [1] 112
# how many tube numbers and patient IDs per V0 and V1
combined_data_CSF %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
    total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# which Tube numbers at V1 do not have a patient ID
combined_data_CSF %>%
  filter(Visit == "V1") %>%
  filter(is.na(Patient))
# patients at V0 and V1
patients_CSF_V0 <- combined_data_CSF %>%
  filter(Visit == "V0") %>%
  pull(Patient) %>%
  unique()
patients_CSF_V1 <- combined_data_CSF %>%
  filter(Visit == "V1") %>%
  pull(Patient) %>%
  unique() %>% 
  na.omit()

# check if all patients at V1 are at V0
all(patients_CSF_V1 %in% patients_CSF_V0)
## [1] FALSE
patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0] # patient TR319
## [1] "TR319"
# check patient in the info_patient dataset
lipidomics_CSF_all[lipidomics_CSF_all$Patient %in% patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0],]

- Plasma

raw_data_plasma_IDs = raw_data_plasma_final %>%
  select(Tube_number,Batch)
raw_data_plasma_IDs <- raw_data_plasma_IDs %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
  filter(!str_detect(Tube_number, "QC")) # remove 66 QC IDs

# how many IDs in the raw data plasma
length(raw_data_plasma_IDs$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma
length(unique(raw_data_plasma_IDs$Tube_number))
## [1] 316
lipidomics_plasma_all <- lipidomics_plasma_all %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_plasma = raw_data_plasma_IDs %>%
  left_join(lipidomics_plasma_all,by = "Tube_number") %>%
  mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_plasma = full_match_plasma %>%
  filter(is.na(match_type)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.y) %>%
  select(-names(lipidomics_plasma_all)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_plasma = no_match_plasma %>%
  left_join(lipidomics_plasma_all,by = "Tube_number4") %>%
  mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))

# final description of the samples available in the raw data without QC samples
combined_data_plasma = full_match_plasma %>%
  filter(!is.na(match_type)) %>%
  select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
  bind_rows(last4_match_plasma %>% 
              select(-Tube_number) %>% 
              left_join(raw_data_plasma_IDs %>% select(Tube_number,Tube_number4)) %>%
              select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
  distinct()

paged_table(combined_data_plasma)
# how many IDs in the raw data plasma now 
length(combined_data_plasma$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma now
length(unique(combined_data_plasma$Tube_number))
## [1] 316
# how many unique Patient IDs
length(unique(na.omit(combined_data_plasma$Patient)))
## [1] 199
# how many tube numbers and patient IDs per V0 and V1
combined_data_plasma %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
    total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# which Tube numbers at V0 do not have a patient ID
combined_data_plasma %>%
  filter(Visit == "V0") %>%
  filter(is.na(Patient))
# which Tube numbers at V1 do not have a patient ID
combined_data_plasma %>%
  filter(Visit == "V1") %>%
  filter(is.na(Patient))
# patients at V0 and V1
patients_plasma_V0 <- combined_data_plasma %>%
  filter(Visit == "V0") %>%
  pull(Patient) %>%
  unique()
patients_plasma_V1 <- combined_data_plasma %>%
  filter(Visit == "V1") %>%
  pull(Patient) %>%
  unique() %>% 
  na.omit()

# check if all patients at V1 are at V0
all(patients_plasma_V1 %in% patients_plasma_V0)
## [1] FALSE
patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0] # patients TR207, TR116 and TR123
## [1] "TR207" "TR116" "TR123"
# check patient in the info_patient dataset
lipidomics_plasma_all[lipidomics_plasma_all$Patient %in% patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0],]

Step 4: Overview of the samples based on status

The fourth step is to give an overview of how many samples of eALS, PGMC, CTR and mimic we have per fluid, visit and country.

- CSF (the patient IDs given by Laura)

# IDs in CSF (131)
IDs_CSF <- read_delim("data input/export-2025-11-07-PREMODIALS-AKDTR_BRNO_CHUFR_HMCIL_HRO_KSSGCH_MRI_NIUSASSK/CSFAcquisition.csv",
                          delim = ";", escape_double = FALSE, trim_ws = TRUE) %>%
  filter(CsfSexists == 1) %>% pull(PatientID) %>% unique()
IDs_CSF_data_1 = GeneralDocumentation %>%
  select(PatientID,ParticipantCode) %>%
  filter(PatientID %in% IDs_CSF) %>%
  dplyr::rename(Patient = ParticipantCode) %>%
  left_join(lipidomics_CSF_all) %>%
  select(Patient,Visit,Position) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_CSF_data_1)
overview_country_CSF_1 = IDs_CSF_data_1 %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_CSF_1
overview_status_CSF_1 = IDs_CSF_data_1 %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,C9orf72,SOD1,TARDBP,FUS,other,`NA`)
overview_status_CSF_1

- CSF (the patient IDs existing in the raw data)

IDs_CSF_data = combined_data_CSF %>%
  select(Patient,Visit,Tube_number) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_CSF_data)
overview_country_CSF = IDs_CSF_data %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_CSF
overview_status_CSF = IDs_CSF_data %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,C9orf72,SOD1,TARDBP,FUS,other,`NA`)
overview_status_CSF

- Plasma (the patient IDs given by Laura)

# IDs in plasma (235)
IDs_plasma <- read_delim("data input/export-2025-11-07-PREMODIALS-AKDTR_BRNO_CHUFR_HMCIL_HRO_KSSGCH_MRI_NIUSASSK/BloodAcquisition.csv",
                          delim = ";", escape_double = FALSE, trim_ws = TRUE) %>%
  filter(BAexists == 1) %>% pull(PatientID) %>% unique()

IDs_plasma_data_1 = GeneralDocumentation %>%
  select(PatientID,ParticipantCode) %>%
  filter(PatientID %in% IDs_plasma) %>%
  dplyr::rename(Patient = ParticipantCode) %>%
  left_join(lipidomics_plasma_all) %>%
  select(Patient,Visit,Position) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_plasma_data_1)
overview_country_plasma_1 = IDs_plasma_data_1 %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_plasma_1
overview_status_plasma_1 = IDs_plasma_data_1 %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,C9orf72,SOD1,TARDBP,FUS,other,`NA`)
overview_status_plasma_1

- Plasma (the patient IDs existing in the raw data)

IDs_plasma_data = combined_data_plasma %>%
  select(Patient,Visit,Tube_number) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_plasma_data)
overview_country_plasma = IDs_plasma_data %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_plasma
overview_status_plasma = IDs_plasma_data %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,C9orf72,SOD1,TARDBP,FUS,other,`NA`)
overview_status_plasma

Bioinformatic analyses

Step 5: creation of summarised experiment datasets

The fifth step is to remove the QC rows, save the experimental setup (info of patient ID, batch, sample ID, visit, etc.), structure the lipidomics dataset to numeric and create summarised experiments datasets.

- CSF

# removal of QC rows
raw_data_CSF_use = raw_data_CSF_final %>%
  filter(!str_detect(Tube_number, "QC"))

tube_batch_CSF = raw_data_CSF_use[,1:2] %>%
  left_join(combined_data_CSF %>% select(Tube_number,Batch,Patient,Visit))
raw_data_CSF_use_mat = as.matrix(raw_data_CSF_use[,3:ncol(raw_data_CSF_use)])
rownames(raw_data_CSF_use_mat) = tube_batch_CSF %>% pull(Tube_number)

# matrix with raw lipidomics data (rows - lipids and column - tube ID)
lipidomics_CSF = as.data.frame(t(raw_data_CSF_use_mat))
dim(lipidomics_CSF)
## [1] 892 144
# make columns numeric
for(i in 1:ncol(lipidomics_CSF)){lipidomics_CSF[,i] = as.numeric(lipidomics_CSF[,i])}

#make a list for all dataframes
lipidomics_data_CSF = list(raw_data = lipidomics_CSF)

## summarized objects 
tube_batch_CSF = tube_batch_CSF %>% 
  left_join(IDs_CSF_data) %>%
  mutate(subtype = ifelse(type == "C9orf72","C9orf72",
                          ifelse(type == "SOD1","SOD1",
                                 ifelse(type == "FUS","FUS",
                                        ifelse(type == "other","other",
                                               ifelse(type == "TARDBP","TARDBP",NA)))))) %>%
  mutate(type = ifelse(type %in% c("C9orf72","SOD1","FUS","TARDBP","other"),
                       "PGMC",type)) %>%
  arrange(subtype) %>%
  distinct(Tube_number, Batch,Patient,PatientID,type, .keep_all = TRUE)

# Check which tube numbers do not have a patient ID or type assigned (majority uncertain PURE MOTOR SYMPTOM (PMS) / MND MIMIC / EARLY ALS (EALS))
tube_number_CSF_missing = tube_batch_CSF %>%
  filter(is.na(Patient) | is.na(type)) %>%
  arrange(Patient)
tube_number_CSF_missing
GeneralDocumentation %>%
  select(ParticipantCode,PatientID,PGMC,ALSuncertainty,ALSFUdiagnosis) %>%
  filter(ParticipantCode %in% tube_number_CSF_missing$Patient) %>%
  filter(!is.na(ParticipantCode))
abundance.columns <- 1:ncol(lipidomics_data_CSF$raw_data) # get abundance column numbers
      clin = data.frame(label = tube_batch_CSF$Tube_number, 
                        condition = tube_batch_CSF$type,
                        sub_condition = tube_batch_CSF$subtype,
                        batch = tube_batch_CSF$Batch)
      
      lipidomics_data_CSF$raw_data$name = lipidomics_data_CSF$raw_data$ID = rownames(lipidomics_data_CSF$raw_data)
      experimental.design = clin
      experimental.design <- experimental.design %>%
        group_by(condition) %>%            
        mutate(replicate = row_number()) %>%
        ungroup()
      
      lipidomics_data_CSF$se <- make_se(lipidomics_data_CSF$raw_data, abundance.columns, experimental.design)
lipidomics_data_CSF$se
## class: SummarizedExperiment 
## dim: 892 144 
## metadata(0):
## assays(1): ''
## rownames(892): CE(18:1-d7) Cholesterol(d7) ... PS(38:4)_PS(18:0/20:4)
##   PS(40:4)_PS(20:0/20:4)
## rowData names(2): ID name
## colnames(144): PGMC_1 PGMC_2 ... CTR_26 CTR_27
## colData names(6): label ID ... batch replicate
 writexl::write_xlsx(lipidomics_data_CSF$raw_data,"results/lipidomics_CSF_raw_data.xlsx")

- Plasma

# removal of QC rows
raw_data_plasma_use = raw_data_plasma_final %>%
  filter(!str_detect(Tube_number, "QC"))

tube_batch_plasma = raw_data_plasma_use[,1:2] %>%
  left_join(combined_data_plasma %>% select(Tube_number,Batch,Patient,Visit))
raw_data_plasma_use_mat = as.matrix(raw_data_plasma_use[,3:ncol(raw_data_plasma_use)])
rownames(raw_data_plasma_use_mat) = tube_batch_plasma %>% pull(Tube_number)

# matrix with raw lipidomics data (rows - lipids and column - tube ID)
lipidomics_plasma = as.data.frame(t(raw_data_plasma_use_mat))
dim(lipidomics_plasma)
## [1] 892 316
# make columns numeric
for(i in 1:ncol(lipidomics_plasma)){lipidomics_plasma[,i] = as.numeric(lipidomics_plasma[,i])}

#make a list for all dataframes
lipidomics_data_plasma = list(raw_data = lipidomics_plasma)

## summarized objects 
tube_batch_plasma = tube_batch_plasma %>% 
  left_join(IDs_plasma_data) %>%
  mutate(subtype = ifelse(type == "C9orf72","C9orf72",
                          ifelse(type == "SOD1","SOD1",
                                 ifelse(type == "FUS","FUS",
                                        ifelse(type == "other","other",
                                               ifelse(type == "TARDBP","TARDBP",NA)))))) %>%
  mutate(type = ifelse(type %in% c("C9orf72","SOD1","FUS","TARDBP","other"),
                       "PGMC",type)) %>%
  arrange(subtype) %>%
  distinct(Tube_number, Batch,Patient,PatientID,type, .keep_all = TRUE)

# Check which tube numbers do not have a patient ID or type assigned (majority uncertain PURE MOTOR SYMPTOM (PMS) / MND MIMIC / EARLY ALS (EALS))
tube_number_plasma_missing = tube_batch_plasma %>%
  filter(is.na(Patient) | is.na(type)) %>%
  arrange(Patient)
tube_number_plasma_missing
GeneralDocumentation %>%
  select(ParticipantCode,PatientID,PGMC,ALSuncertainty,ALSFUdiagnosis) %>%
  filter(ParticipantCode %in% tube_number_plasma_missing$Patient) %>%
  filter(!is.na(ParticipantCode))
abundance.columns <- 1:ncol(lipidomics_data_plasma$raw_data) # get abundance column numbers
      clin = data.frame(label = tube_batch_plasma$Tube_number, 
                        condition = tube_batch_plasma$type,
                        sub_condition = tube_batch_plasma$subtype,
                        batch = tube_batch_plasma$Batch)
      
      lipidomics_data_plasma$raw_data$name = lipidomics_data_plasma$raw_data$ID = rownames(lipidomics_data_plasma$raw_data)
      experimental.design = clin
      experimental.design <- experimental.design %>%
        group_by(condition) %>%            
        mutate(replicate = row_number()) %>%
        ungroup()
      
      lipidomics_data_plasma$se <- make_se(lipidomics_data_plasma$raw_data, abundance.columns, experimental.design)
 lipidomics_data_plasma$se
## class: SummarizedExperiment 
## dim: 892 316 
## metadata(0):
## assays(1): ''
## rownames(892): CE(18:1-d7) Cholesterol(d7) ... PS(38:4)_PS(18:0/20:4)
##   PS(40:4)_PS(20:0/20:4)
## rowData names(2): ID name
## colnames(316): PGMC_1 PGMC_2 ... mimic_31 ALS_89
## colData names(6): label ID ... batch replicate
 writexl::write_xlsx(lipidomics_data_plasma$raw_data,"results/lipidomics_plasma_raw_data.xlsx")

Step 6: missing analyses

The sixth step is to check for missing omics per patient and overall, filter lipids that are not quantified in less than 1/3 of the samples and normalise the data (using vsn).

- CSF

# heatmap missing 
vis_miss(lipidomics_data_CSF$raw_data,show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_CSF.png", width = 15, height = 10, units = "in")

# filter for lipids that are quantified in at least 2/3 of the samples
lipidomics_data_CSF$se_filtered = filter_proteins(lipidomics_data_CSF$se,"fraction",min = 0.66)

#dimensions of the data
dim(lipidomics_data_CSF$se)
## [1] 892 144
dim(lipidomics_data_CSF$se_filtered)
## [1] 186 144
# heatmap missing for filtered data
vis_miss(as.data.frame(assay(lipidomics_data_CSF$se_filtered)),show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_filtered_CSF.png", width = 12, height = 8, units = "in")

plot_frequency(lipidomics_data_CSF$se)

ggsave("plots/frequency_met_identification_raw_CSF.pdf", width = 11, height = 8, units = "in")
plot_frequency(lipidomics_data_CSF$se_filtered)

ggsave("plots/frequency_met_identification_filtered_CSF.pdf", width = 11, height = 8, units = "in")

# % missing per patient: 
round(apply(X = as.data.frame(assay(lipidomics_data_CSF$se)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_CSF$se))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##     77.5     76.3     77.9     77.5     77.4     77.7     77.0     77.2 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##     79.0     79.8     76.9     78.6     76.7     78.6     77.8     77.7 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##     78.3     78.6     77.1     76.6     77.5     77.6     78.9     79.0 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30    CTR_1  mimic_1 
##     78.6     78.0     77.5     77.2     78.4     77.8     76.5     73.0 
##    CTR_2  mimic_2    ALS_1    CTR_3    NA._1    ALS_2    ALS_3    CTR_4 
##     77.1     75.8     73.8     73.3     78.3     75.9     76.5     76.9 
##  mimic_3    ALS_4    CTR_5    CTR_6  mimic_4  mimic_5    NA._2    ALS_5 
##     76.1     76.2     75.9     77.2     77.0     74.8     78.0     76.3 
##  mimic_6    ALS_6    ALS_7    CTR_7  mimic_7    CTR_8  mimic_8    ALS_8 
##     76.3     78.5     76.1     76.3     76.1     76.8     76.1     75.8 
##    ALS_9    CTR_9   ALS_10  mimic_9   ALS_11   ALS_12   ALS_13 mimic_10 
##     76.6     76.9     75.7     72.5     74.6     73.9     73.5     76.9 
##   ALS_14    NA._3   ALS_15   CTR_10 mimic_11   CTR_11   ALS_16   CTR_12 
##     78.3     76.2     78.4     77.7     78.4     78.6     75.0     72.9 
##   CTR_13   CTR_14 mimic_12   CTR_15   CTR_16    NA._4 mimic_13   ALS_17 
##     78.4     77.1     77.6     77.6     78.0     77.8     75.4     77.9 
##   ALS_18 mimic_14   CTR_17   ALS_19   ALS_20 mimic_15   ALS_21   ALS_22 
##     79.1     75.6     76.9     77.0     78.6     76.6     79.9     75.8 
##   CTR_18 mimic_16   ALS_23   ALS_24    NA._5   CTR_19   ALS_25   ALS_26 
##     77.9     76.0     76.2     75.3     77.9     77.7     78.1     78.8 
##   ALS_27   ALS_28    NA._6   ALS_29   ALS_30    NA._7   ALS_31   ALS_32 
##     78.5     79.3     80.0     78.0     79.0     78.1     77.5     78.0 
##   ALS_33   CTR_20    NA._8 mimic_17    NA._9   CTR_21   CTR_22   ALS_34 
##     75.2     79.0     77.9     77.5     76.5     77.2     80.4     81.1 
##   ALS_35   ALS_36   ALS_37   CTR_23   ALS_38   ALS_39   ALS_40 mimic_18 
##     76.6     74.4     79.3     79.1     77.1     78.7     77.9     78.8 
##   ALS_41   NA._10   NA._11   CTR_24   ALS_42   ALS_43   ALS_44   ALS_45 
##     77.5     79.4     78.1     79.0     77.8     76.0     76.3     77.7 
##   CTR_25 mimic_19 mimic_20   NA._12   NA._13   ALS_46   ALS_47   ALS_48 
##     78.8     79.8     77.9     59.3     78.3     78.4     78.0     76.6 
##   ALS_49   NA._14 mimic_21   ALS_50 mimic_22   ALS_51   CTR_26   CTR_27 
##     74.9     81.6     80.2     77.4     76.9     77.0     78.3     78.5
round(apply(X = as.data.frame(assay(lipidomics_data_CSF$se_filtered)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_CSF$se_filtered))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##      1.6      2.2      5.4      1.1      2.2      2.2      0.5      2.7 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##      6.5      6.5      7.0      3.2      4.8      3.8      3.8      3.8 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##      3.2      4.8      2.2      0.5      2.2      2.7      7.0      3.8 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30    CTR_1  mimic_1 
##      3.2      3.8      3.8      4.8      3.2      2.7      2.2      0.0 
##    CTR_2  mimic_2    ALS_1    CTR_3    NA._1    ALS_2    ALS_3    CTR_4 
##      5.4      0.5      0.5      0.0      3.2      1.1      1.1      1.1 
##  mimic_3    ALS_4    CTR_5    CTR_6  mimic_4  mimic_5    NA._2    ALS_5 
##      1.6      2.7      1.1      2.7      1.6      2.2      3.8      1.1 
##  mimic_6    ALS_6    ALS_7    CTR_7  mimic_7    CTR_8  mimic_8    ALS_8 
##      2.2      4.8      1.6      1.1      1.1      2.2      1.1      0.5 
##    ALS_9    CTR_9   ALS_10  mimic_9   ALS_11   ALS_12   ALS_13 mimic_10 
##      1.1      1.6      1.1      2.2      0.5      0.5      0.5      1.6 
##   ALS_14    NA._3   ALS_15   CTR_10 mimic_11   CTR_11   ALS_16   CTR_12 
##      6.5      2.2      3.2      1.6      3.8      2.7      0.5      2.7 
##   CTR_13   CTR_14 mimic_12   CTR_15   CTR_16    NA._4 mimic_13   ALS_17 
##      2.7      1.6      3.8      2.2      2.7      3.8      1.1      2.2 
##   ALS_18 mimic_14   CTR_17   ALS_19   ALS_20 mimic_15   ALS_21   ALS_22 
##      6.5      1.1      1.1      1.1      4.3      2.2      8.1      1.1 
##   CTR_18 mimic_16   ALS_23   ALS_24    NA._5   CTR_19   ALS_25   ALS_26 
##      2.2      1.6      1.6      1.1      2.2      2.2      3.2      4.3 
##   ALS_27   ALS_28    NA._6   ALS_29   ALS_30    NA._7   ALS_31   ALS_32 
##      3.2      5.9      9.1      5.4      6.5      4.3      4.8      2.7 
##   ALS_33   CTR_20    NA._8 mimic_17    NA._9   CTR_21   CTR_22   ALS_34 
##      3.2      5.4      4.3      3.2      2.2      2.7     11.3     14.0 
##   ALS_35   ALS_36   ALS_37   CTR_23   ALS_38   ALS_39   ALS_40 mimic_18 
##      2.7      1.1      8.1      6.5      1.6      7.0      3.2      5.4 
##   ALS_41   NA._10   NA._11   CTR_24   ALS_42   ALS_43   ALS_44   ALS_45 
##      1.6      6.5      3.2      7.5      4.3      2.7      0.5      2.2 
##   CTR_25 mimic_19 mimic_20   NA._12   NA._13   ALS_46   ALS_47   ALS_48 
##      7.0      9.7      3.8      0.5      3.8      3.8      3.8      1.6 
##   ALS_49   NA._14 mimic_21   ALS_50 mimic_22   ALS_51   CTR_26   CTR_27 
##      0.5     17.7      8.6      3.2      1.6      3.2      4.8      5.4
#normalization
lipidomics_data_CSF$se_filt_norm <- normalize_vsn(lipidomics_data_CSF$se_filtered)
DEP::meanSdPlot(lipidomics_data_CSF$se_filt_norm)

writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filtered)),"results/data_filtered_CSF.xlsx")
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filt_norm)),"results/data_filtered_normalized_CSF.xlsx")

# imputation
lipidomics_data_CSF$se_filt_impute <- impute(
  lipidomics_data_CSF$se_filt_norm,fun = "MinProb",q = 0.01)
## Imputing along margin 2 (samples/columns).
## [1] 0.4905946
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filt_impute)),"results/data_filtered_normalized_imputed_CSF.xlsx")

- Plasma

# heatmap missing 
vis_miss(lipidomics_data_plasma$raw_data,show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_plasma.png", width = 15, height = 10, units = "in")

# filter for lipids that are quantified in at least 2/3 of the samples
lipidomics_data_plasma$se_filtered = filter_proteins(lipidomics_data_plasma$se,"fraction",min = 0.66)

#dimensions of the data
dim(lipidomics_data_plasma$se)
## [1] 892 316
dim(lipidomics_data_plasma$se_filtered)
## [1] 684 316
# heatmap missing for filtered data
vis_miss(as.data.frame(assay(lipidomics_data_plasma$se_filtered)),show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_filtered_plasma.png", width = 12, height = 8, units = "in")

plot_frequency(lipidomics_data_plasma$se)

ggsave("plots/frequency_met_identification_raw_plasma.pdf", width = 11, height = 8, units = "in")
plot_frequency(lipidomics_data_plasma$se_filtered)

ggsave("plots/frequency_met_identification_filtered_plasma.pdf", width = 11, height = 8, units = "in")

# % missing per patient: 
round(apply(X = as.data.frame(assay(lipidomics_data_plasma$se)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_plasma$se))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##     16.6     18.7     19.7     19.1     19.5     18.8     20.0     20.7 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##     22.0     22.6     23.1     23.7     22.3     21.1     21.3     22.2 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##     22.4     22.3     27.9     29.1     24.3     27.5     28.4     24.4 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30  PGMC_31  PGMC_32 
##     26.9     26.8     24.8     23.1     22.1     22.6     22.1     16.8 
##  PGMC_33  PGMC_34  PGMC_35  PGMC_36  PGMC_37  PGMC_38  PGMC_39  PGMC_40 
##     16.7     16.7     18.6     20.2     19.3     17.8     18.0     18.5 
##  PGMC_41  PGMC_42  PGMC_43  PGMC_44  PGMC_45  PGMC_46  PGMC_47  PGMC_48 
##     20.4     20.0     16.6     18.2     18.5     19.1     17.9     18.5 
##  PGMC_49  PGMC_50  PGMC_51  PGMC_52  PGMC_53  PGMC_54  PGMC_55  PGMC_56 
##     17.9     19.8     23.8     25.0     20.0     20.1     21.5     21.3 
##  PGMC_57  PGMC_58  PGMC_59  PGMC_60  PGMC_61  PGMC_62  PGMC_63  PGMC_64 
##     22.4     23.0     27.5     17.4     15.5     17.9     18.2     16.4 
##  PGMC_65  PGMC_66  PGMC_67  PGMC_68  PGMC_69  PGMC_70  PGMC_71  PGMC_72 
##     16.8     17.5     18.6     16.7     18.0     23.3     23.0     16.1 
##  PGMC_73  PGMC_74  PGMC_75  PGMC_76  PGMC_77  PGMC_78  PGMC_79  PGMC_80 
##     19.4     19.1     20.1     18.7     23.1     18.3     21.6     20.6 
##  PGMC_81  PGMC_82  PGMC_83  PGMC_84  PGMC_85    ALS_1    CTR_1    CTR_2 
##     19.1     19.7     22.9     28.1     25.4     18.2     17.9     17.6 
##    ALS_2    CTR_3    ALS_3  mimic_1    CTR_4    ALS_4    ALS_5    ALS_6 
##     16.0     17.8     17.9     20.9     15.7     16.8     15.8     17.2 
##    NA._1  mimic_2    ALS_7    CTR_5    CTR_6  mimic_3    ALS_8    ALS_9 
##     16.4     15.7     14.1     16.1     15.5     18.3     18.4     18.2 
##    CTR_7    CTR_8    CTR_9    NA._2   CTR_10    NA._3   CTR_11   ALS_10 
##     21.7     16.6     16.8     17.2     18.8     16.7     18.2     16.4 
##   ALS_11   ALS_12   CTR_12    NA._4  mimic_4   ALS_13   CTR_13   ALS_14 
##     16.5     14.8     17.4     14.8     17.4     15.4     16.0     15.4 
##   ALS_15   CTR_14  mimic_5  mimic_6  mimic_7   ALS_16   CTR_15   CTR_16 
##     16.8     17.9     18.3     20.0     19.2     21.9     18.0     17.2 
##  mimic_8   ALS_17   CTR_17   ALS_18   CTR_18  mimic_9   CTR_19   CTR_20 
##     17.0     18.4     17.6     17.2     19.8     19.3     17.5     18.5 
## mimic_10   CTR_21   CTR_22 mimic_11   ALS_19   CTR_23 mimic_12   ALS_20 
##     17.5     17.8     20.1     18.5     16.8     19.2     20.1     17.7 
##   ALS_21   ALS_22   ALS_23   ALS_24   CTR_24   CTR_25   ALS_25   CTR_26 
##     23.8     16.1     19.5     17.3     21.0     19.7     21.1     19.3 
##   CTR_27   ALS_26   ALS_27   CTR_28   CTR_29   CTR_30   CTR_31   CTR_32 
##     20.0     18.9     17.2     16.7     17.3     18.9     19.6     19.3 
##   CTR_33   ALS_28   ALS_29 mimic_13   ALS_30   CTR_34   ALS_31   CTR_35 
##     19.1     19.4     18.3     17.3     19.1     21.1     18.2     20.3 
##   ALS_32   ALS_33   CTR_36   CTR_37   ALS_34   ALS_35   ALS_36    NA._5 
##     18.3     18.7     17.4     20.5     16.9     19.4     17.9     19.7 
##   CTR_38   ALS_37   CTR_39   CTR_40   ALS_38 mimic_14   ALS_39 mimic_15 
##     19.1     16.1     19.1     18.7     20.9     17.9     20.0     20.4 
##   CTR_41   CTR_42   CTR_43   ALS_40   ALS_41   CTR_44    NA._6   ALS_42 
##     21.1     22.1     20.1     22.4     20.1     21.5     20.9     20.5 
##   ALS_43   ALS_44    NA._7    NA._8   CTR_45   ALS_45   CTR_46 mimic_16 
##     22.5     23.4     23.3     20.4     20.2     20.6     23.5     20.1 
## mimic_17   CTR_47   CTR_48   ALS_46   CTR_49   CTR_50 mimic_18   ALS_47 
##     20.0     21.6     21.0     21.6     21.3     20.6     21.3     20.2 
##   CTR_51   CTR_52 mimic_19   CTR_53   CTR_54   CTR_55   CTR_56   ALS_48 
##     21.3     21.5     20.9     21.1     21.6     22.6     21.3     21.9 
##   ALS_49   ALS_50   CTR_57   CTR_58   CTR_59   CTR_60   ALS_51   CTR_61 
##     25.6     24.0     22.8     21.6     23.1     23.2     20.6     24.3 
##   ALS_52   CTR_62   ALS_53   CTR_63   ALS_54   ALS_55   ALS_56   CTR_64 
##     20.9     21.4     23.4     22.1     24.3     22.3     25.1     23.2 
##   ALS_57 mimic_20   ALS_58    NA._9   ALS_59   NA._10   ALS_60   ALS_61 
##     22.1     21.7     22.9     22.8     22.3     24.3     22.1     23.1 
##   ALS_62   ALS_63   ALS_64   ALS_65 mimic_21 mimic_22 mimic_23   CTR_65 
##     22.4     21.4     23.7     22.0     22.8     24.7     25.2     23.4 
##   CTR_66 mimic_24   NA._11   CTR_67   NA._12   CTR_68   NA._13   ALS_66 
##     22.2     23.7     22.9     24.2     22.2     25.4     21.4     23.0 
##   NA._14   CTR_69   ALS_67   CTR_70 mimic_25   NA._15   ALS_68 mimic_26 
##     23.3     21.7     23.3     23.3     21.6     22.2     26.3     22.4 
##   ALS_69   NA._16   CTR_71   CTR_72   NA._17   ALS_70 mimic_27   NA._18 
##     23.2     22.8     23.9     23.2     23.2     25.2     21.6     22.6 
##   ALS_71   CTR_73   CTR_74   CTR_75   ALS_72   CTR_76   NA._19   CTR_77 
##     22.9     20.1     23.1     23.9     26.0     22.3     23.3     23.5 
##   ALS_73   ALS_74   NA._20 mimic_28   NA._21   NA._22   CTR_78   NA._23 
##     23.3     23.4     22.2     23.4     28.9     27.6     27.7     31.1 
##   CTR_79   NA._24   NA._25   ALS_75   ALS_76  PGMC_86   ALS_77 mimic_29 
##     29.3     25.0     25.1     27.0     27.0     25.7     28.9     24.4 
##   CTR_80   ALS_78   ALS_79   ALS_80   ALS_81   ALS_82   NA._26   ALS_83 
##     25.8     27.5     25.8     25.9     28.1     25.7     26.3     25.4 
##   CTR_81   ALS_84   ALS_85   CTR_82   NA._27   ALS_86   ALS_87   NA._28 
##     23.0     24.2     25.3     26.7     23.0     25.7     24.7     25.8 
## mimic_30   ALS_88 mimic_31   ALS_89 
##     26.9     27.1     27.6     23.2
round(apply(X = as.data.frame(assay(lipidomics_data_plasma$se_filtered)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_plasma$se_filtered))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##      1.6      2.5      4.8      3.9      2.3      1.8      0.9      1.2 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##      1.6      2.6      2.9      3.4      1.8      0.7      1.2      1.3 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##      1.5      1.8      6.6      7.9      3.7      6.3      7.6      2.9 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30  PGMC_31  PGMC_32 
##      5.8      5.7      3.9      2.8      1.3      1.6      1.6      1.3 
##  PGMC_33  PGMC_34  PGMC_35  PGMC_36  PGMC_37  PGMC_38  PGMC_39  PGMC_40 
##      1.3      0.7      1.3      2.6      2.3      1.5      2.0      1.3 
##  PGMC_41  PGMC_42  PGMC_43  PGMC_44  PGMC_45  PGMC_46  PGMC_47  PGMC_48 
##      3.4      2.8      0.6      1.2      0.9      2.0      1.3      0.9 
##  PGMC_49  PGMC_50  PGMC_51  PGMC_52  PGMC_53  PGMC_54  PGMC_55  PGMC_56 
##      1.3      1.6      2.9      5.8      1.2      1.0      2.2      1.5 
##  PGMC_57  PGMC_58  PGMC_59  PGMC_60  PGMC_61  PGMC_62  PGMC_63  PGMC_64 
##      1.8      3.5      6.1      1.8      0.4      2.8      2.2      0.6 
##  PGMC_65  PGMC_66  PGMC_67  PGMC_68  PGMC_69  PGMC_70  PGMC_71  PGMC_72 
##      2.0      1.2      1.3      0.4      0.9      1.9      2.3      0.4 
##  PGMC_73  PGMC_74  PGMC_75  PGMC_76  PGMC_77  PGMC_78  PGMC_79  PGMC_80 
##      2.9      1.0      2.8      0.9      5.4      1.3      1.2      1.0 
##  PGMC_81  PGMC_82  PGMC_83  PGMC_84  PGMC_85    ALS_1    CTR_1    CTR_2 
##      0.4      1.0      1.8      7.0      4.2      2.8      0.9      1.8 
##    ALS_2    CTR_3    ALS_3  mimic_1    CTR_4    ALS_4    ALS_5    ALS_6 
##      0.4      0.9      0.9      4.2      1.0      0.6      0.7      1.9 
##    NA._1  mimic_2    ALS_7    CTR_5    CTR_6  mimic_3    ALS_8    ALS_9 
##      1.0      0.6      0.9      1.3      0.7      2.3      1.9      1.9 
##    CTR_7    CTR_8    CTR_9    NA._2   CTR_10    NA._3   CTR_11   ALS_10 
##      5.8      0.3      0.9      0.9      1.3      1.0      1.5      0.9 
##   ALS_11   ALS_12   CTR_12    NA._4  mimic_4   ALS_13   CTR_13   ALS_14 
##      0.6      0.1      1.0      0.4      1.8      0.7      0.1      0.4 
##   ALS_15   CTR_14  mimic_5  mimic_6  mimic_7   ALS_16   CTR_15   CTR_16 
##      0.7      2.0      0.3      2.9      1.9      4.1      0.6      1.0 
##  mimic_8   ALS_17   CTR_17   ALS_18   CTR_18  mimic_9   CTR_19   CTR_20 
##      0.9      1.6      1.0      1.3      1.9      2.0      2.3      1.6 
## mimic_10   CTR_21   CTR_22 mimic_11   ALS_19   CTR_23 mimic_12   ALS_20 
##      0.9      1.5      2.2      0.9      0.3      1.5      1.3      0.6 
##   ALS_21   ALS_22   ALS_23   ALS_24   CTR_24   CTR_25   ALS_25   CTR_26 
##      6.3      0.6      1.3      0.3      2.8      1.9      3.9      1.5 
##   CTR_27   ALS_26   ALS_27   CTR_28   CTR_29   CTR_30   CTR_31   CTR_32 
##      1.9      1.0      0.6      0.4      0.4      2.2      2.0      1.8 
##   CTR_33   ALS_28   ALS_29 mimic_13   ALS_30   CTR_34   ALS_31   CTR_35 
##      2.2      1.5      1.9      0.4      2.5      2.6      1.0      3.5 
##   ALS_32   ALS_33   CTR_36   CTR_37   ALS_34   ALS_35   ALS_36    NA._5 
##      0.7      1.0      0.4      4.1      0.9      2.6      1.6      1.5 
##   CTR_38   ALS_37   CTR_39   CTR_40   ALS_38 mimic_14   ALS_39 mimic_15 
##      1.6      0.1      1.0      2.0      2.9      0.9      1.2      1.3 
##   CTR_41   CTR_42   CTR_43   ALS_40   ALS_41   CTR_44    NA._6   ALS_42 
##      1.2      2.0      0.7      1.5      1.2      2.2      0.9      1.2 
##   ALS_43   ALS_44    NA._7    NA._8   CTR_45   ALS_45   CTR_46 mimic_16 
##      2.8      3.8      2.9      0.6      1.0      2.0      3.9      0.6 
## mimic_17   CTR_47   CTR_48   ALS_46   CTR_49   CTR_50 mimic_18   ALS_47 
##      0.9      1.2      0.9      2.2      1.9      0.6      0.9      1.2 
##   CTR_51   CTR_52 mimic_19   CTR_53   CTR_54   CTR_55   CTR_56   ALS_48 
##      0.7      1.0      0.9      0.6      1.2      2.5      0.9      1.2 
##   ALS_49   ALS_50   CTR_57   CTR_58   CTR_59   CTR_60   ALS_51   CTR_61 
##      5.4      3.5      1.8      1.6      1.0      2.5      0.6      4.1 
##   ALS_52   CTR_62   ALS_53   CTR_63   ALS_54   ALS_55   ALS_56   CTR_64 
##      0.6      1.3      4.5      1.9      3.1      1.3      3.9      2.3 
##   ALS_57 mimic_20   ALS_58    NA._9   ALS_59   NA._10   ALS_60   ALS_61 
##      1.5      1.6      2.3      1.9      1.9      2.9      1.6      2.0 
##   ALS_62   ALS_63   ALS_64   ALS_65 mimic_21 mimic_22 mimic_23   CTR_65 
##      1.8      0.7      2.0      1.3      1.9      4.5      3.9      2.2 
##   CTR_66 mimic_24   NA._11   CTR_67   NA._12   CTR_68   NA._13   ALS_66 
##      1.5      2.6      2.0      2.9      2.2      3.9      1.2      1.9 
##   NA._14   CTR_69   ALS_67   CTR_70 mimic_25   NA._15   ALS_68 mimic_26 
##      2.2      1.2      3.1      2.6      1.2      1.8      6.0      1.5 
##   ALS_69   NA._16   CTR_71   CTR_72   NA._17   ALS_70 mimic_27   NA._18 
##      1.9      2.2      3.1      2.2      2.6      4.5      1.2      1.8 
##   ALS_71   CTR_73   CTR_74   CTR_75   ALS_72   CTR_76   NA._19   CTR_77 
##      1.6      0.9      2.8      3.5      6.1      2.2      2.0      2.8 
##   ALS_73   ALS_74   NA._20 mimic_28   NA._21   NA._22   CTR_78   NA._23 
##      3.4      2.6      1.5      2.5      8.0      6.7      6.4     10.8 
##   CTR_79   NA._24   NA._25   ALS_75   ALS_76  PGMC_86   ALS_77 mimic_29 
##      9.1      4.2      4.2      6.6      5.7      4.8      8.8      3.2 
##   CTR_80   ALS_78   ALS_79   ALS_80   ALS_81   ALS_82   NA._26   ALS_83 
##      4.5      7.0      4.7      4.7      7.9      4.7      4.7      4.5 
##   CTR_81   ALS_84   ALS_85   CTR_82   NA._27   ALS_86   ALS_87   NA._28 
##      2.2      3.4      4.4      5.4      2.5      4.7      3.7      4.2 
## mimic_30   ALS_88 mimic_31   ALS_89 
##      6.3      6.7      6.4      2.6
#normalization
lipidomics_data_plasma$se_filt_norm <- normalize_vsn(lipidomics_data_plasma$se_filtered)
DEP::meanSdPlot(lipidomics_data_plasma$se_filt_norm)

writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filtered)),"results/data_filtered_plasma.xlsx")
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filt_norm)),"results/data_filtered_normalized_plasma.xlsx")

# imputation
lipidomics_data_plasma$se_filt_impute <- impute(
  lipidomics_data_plasma$se_filt_norm,fun = "MinProb",q = 0.01)
## Imputing along margin 2 (samples/columns).
## [1] 0.6924497
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filt_impute)),"results/data_filtered_normalized_imputed_plasma.xlsx")

Step 7: Visualizations

The seventh step is to create visualizations, such as heatmaps and PCAs (for each fluid with ID/batch info, and for both fluids together)

- CSF

## Distribution of expression values per ID and tube number
mean_expression_plot(data = t(assay(lipidomics_data_CSF$se)), 
                      file_sample = "plots/boxplots_expression_each_sample_CSF.pdf",
                      file_mass = "plots/boxplots_expression_each_lipid_CSF.pdf",
                      title = "CSF lipidomics")

## Heatmaps
# CSF
title = "lipidomics CSF" 

data = assay(lipidomics_data_CSF$se)
#row annotation
names_lipids = rownames(data)
Lipids_final <- read_excel("data input/RE_ metabolomics/Lipids_final.xlsx")
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])
        
#without grouping, all lipids
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap all lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered lipids
data = assay(lipidomics_data_CSF$se_filt_norm)

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_filtered_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered and imputed
data = assay(lipidomics_data_CSF$se_filt_impute)

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_",title,".pdf"))
## quartz_off_screen 
##                 2
# without grouping, 100 most variable lipids
d = assay(lipidomics_data_CSF$se)
d2 = head(order(rowVars(d),decreasing = T),100)

data = d[d2,]
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = d[d2,], 
                  cluster_cols = F, 
                  main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
                  labels_col = colnames(d))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_mostvar_",title,".pdf"))
## quartz_off_screen 
##                 2
## UMAP (with participant status type)
d_CSF = t(assay(lipidomics_data_CSF$se_filt_impute))
labels_group = tube_batch_CSF$type
group = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4")
UMAP_density_plot(data = d_CSF,
                          ggtitle = paste0("UMAP with type labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_type_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_type_labels_",title,".pdf"),
                          colour_set = group)

# UMAP (with batch info)
labels_group = tube_batch_CSF$Batch
group = c("#3DB7E4", "#FF8849", "#69BE28")
UMAP_density_plot(data = d_CSF,
                          ggtitle = paste0("UMAP with batch labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_batch_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_batch_labels_",title,".pdf"),
                          colour_set = group)

- Plasma

## Distribution of expression values per ID and tube number
mean_expression_plot(data = t(assay(lipidomics_data_plasma$se)), 
                      file_sample = "plots/boxplots_expression_each_sample_plasma.pdf",
                      file_mass = "plots/boxplots_expression_each_lipid_plasma.pdf",
                      title = "plasma lipidomics")

## Heatmaps
# plasma
title = "lipidomics plasma" 

data = assay(lipidomics_data_plasma$se)
#row annotation
names_lipids = rownames(data)
Lipids_final <- read_excel("data input/RE_ metabolomics/Lipids_final.xlsx")
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])
        
#without grouping, all lipids
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap all lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered lipids
data = assay(lipidomics_data_plasma$se_filt_norm)

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_filtered_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered and imputed
data = assay(lipidomics_data_plasma$se_filt_impute)

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_",title,".pdf"))
## quartz_off_screen 
##                 2
# without grouping, 100 most variable lipids
d = assay(lipidomics_data_plasma$se)
d2 = head(order(rowVars(d),decreasing = T),100)

data = d[d2,]
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = d[d2,], 
                  cluster_cols = F, 
                  main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
                  labels_col = colnames(d))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_mostvar_",title,".pdf"))
## quartz_off_screen 
##                 2
## UMAP (with participant status type)
d_plasma = t(assay(lipidomics_data_plasma$se_filt_impute))
labels_group = tube_batch_plasma$type
group = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4")
UMAP_density_plot(data = d_plasma,
                          ggtitle = paste0("UMAP with type labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_type_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_type_labels_",title,".pdf"),
                          colour_set = group)

# UMAP (with batch info)
labels_group = tube_batch_plasma$Batch
group = c("#3DB7E4", "#FF8849", "#69BE28","#757083","#CF7684","#A0C8C0","#CDAE3B")
UMAP_density_plot(data = d_plasma,
                          ggtitle = paste0("UMAP with batch labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_batch_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_batch_labels_",title,".pdf"),
                          colour_set = group)

  • CSF and Plasma together
 proteins_in_both_fluids = colnames(d_CSF)[colnames(d_CSF) %in% colnames(d_plasma)]
  
  d_1 = d_CSF[,proteins_in_both_fluids]
  d_2 = d_plasma[,proteins_in_both_fluids]
  
  d = do.call("rbind",list(d_1,d_2))
  
  labels_group = c(rep("CSF", 144), rep("Plasma", 316))
  title = "plasma_vs_CSF"
  d <- is.na(d)
  
  dim(d)
## [1] 460 183
#perform plots with function
UMAP_density_plot(data = d,
                  ggtitle = paste0("UMAP with fluid labels\n", title),
                  legend_name = "Fluid labels",
                  labels = labels_group,
                  file_location = paste0("plots/UMAP_fluid_group_lipids_",title,".pdf"),
                  file_location_labels=paste0("plots/UMAP_fluid_group_labels_",title,".pdf"),
                  colour_set = group)

Step 8

The eighth step is to perform differential expression analyses, with particular interest of eALS vs CTR, PGMC vs CTR. For this, the information between IDs and the status (eALS, CTR, PGMC, mimic) have to be collected from other datasets